# import packages
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


# read in matched data from the 7 relevant states
states = ['TN', 'SC', 'NC', 'LA', 'GA', 'FL', 'AL']

all_states = {}
for state in states:
    dat = pd.read_csv('/REDACTED/tpv_match_out/tp_voter_restrict' + state + '.csv')
    all_states[state] = dat


al = all_states['AL']
fl = all_states['FL']
ga = all_states['GA']
la = all_states['LA']
nc = all_states['NC']
sc = all_states['SC']
tn = all_states['TN']

# read in large individual dataset, merge with matched voters in each state
keep_cols = ['taxpayer_id_new', 'taxpayer_id', 'predicted_prob_black', 'aud_no_research_audits', 'isEIC']
individual2014 = pd.read_csv("/REDACTED/data/final/individualBISG2014_full_final.csv", usecols=keep_cols)

al['taxpayer_id_new'] = al['taxpayer_id']
fl['taxpayer_id_new'] = fl['taxpayer_id']
ga['taxpayer_id_new'] = ga['taxpayer_id']
la['taxpayer_id_new'] = la['taxpayer_id']
nc['taxpayer_id_new'] = nc['taxpayer_id']
sc['taxpayer_id_new'] = sc['taxpayer_id']
tn['taxpayer_id_new'] = tn['taxpayer_id']


al_m = pd.merge(al, individual2014, on='taxpayer_id_new')
fl_m = pd.merge(fl, individual2014, on='taxpayer_id_new')
ga_m = pd.merge(ga, individual2014, on='taxpayer_id_new')
la_m = pd.merge(la, individual2014, on='taxpayer_id_new')
nc_m = pd.merge(nc, individual2014, on='taxpayer_id_new')
sc_m = pd.merge(sc, individual2014, on='taxpayer_id_new')
tn_m = pd.merge(tn, individual2014, on='taxpayer_id_new')


# resolve duplicate predicted_prob_black column
al_m['predicted_prob_black'] = al_m['predicted_prob_black_y']
al_m = al_m.drop(columns = ['predicted_prob_black_x', 'predicted_prob_black_y'])
fl_m['predicted_prob_black'] = fl_m['predicted_prob_black_y']
fl_m = fl_m.drop(columns = ['predicted_prob_black_x', 'predicted_prob_black_y'])
ga_m['predicted_prob_black'] = ga_m['predicted_prob_black_y']
ga_m = ga_m.drop(columns = ['predicted_prob_black_x', 'predicted_prob_black_y'])
la_m['predicted_prob_black'] = la_m['predicted_prob_black_y']
la_m = la_m.drop(columns = ['predicted_prob_black_x', 'predicted_prob_black_y'])
nc_m['predicted_prob_black'] = nc_m['predicted_prob_black_y']
nc_m = nc_m.drop(columns = ['predicted_prob_black_x', 'predicted_prob_black_y'])
sc_m['predicted_prob_black'] = sc_m['predicted_prob_black_y']
sc_m = sc_m.drop(columns = ['predicted_prob_black_x', 'predicted_prob_black_y'])
tn_m['predicted_prob_black'] = tn_m['predicted_prob_black_y']
tn_m = tn_m.drop(columns = ['predicted_prob_black_x', 'predicted_prob_black_y'])

# create variables for rounded bifsg scores and unit weight
al_m['p_black_rd'] = al_m['predicted_prob_black'].round(2)
fl_m['p_black_rd'] = fl_m['predicted_prob_black'].round(2)
ga_m['p_black_rd'] = ga_m['predicted_prob_black'].round(2)
la_m['p_black_rd'] = la_m['predicted_prob_black'].round(2)
nc_m['p_black_rd'] = nc_m['predicted_prob_black'].round(2)
sc_m['p_black_rd'] = sc_m['predicted_prob_black'].round(2)
tn_m['p_black_rd'] = tn_m['predicted_prob_black'].round(2)

al_m['unitwt'] = 1
fl_m['unitwt'] = 1
ga_m['unitwt'] = 1
la_m['unitwt'] = 1
nc_m['unitwt'] = 1
sc_m['unitwt'] = 1
tn_m['unitwt'] = 1

# filter to individuals with ground truth race labels, assign black indicator variable
al_mf = al_m[al_m.race_ethn.notnull()]
fl_mf = fl_m[fl_m.race_ethn.notnull()]
ga_mf = ga_m[ga_m.race_ethn.notnull()]
la_mf = la_m[la_m.race_ethn.notnull()]
nc_mf = nc_m[nc_m.race_ethn.notnull()]
sc_mf = sc_m[sc_m.race_ethn.notnull()]
tn_mf = tn_m[tn_m.race_ethn.notnull()]

al_mf['black_ind'] = np.where(al_mf['race_ethn'] == 'African or Af-Am Self Reported', 1, 0)
fl_mf['black_ind'] = np.where(fl_mf['race_ethn'] == 'African or Af-Am Self Reported', 1, 0)
ga_mf['black_ind'] = np.where(ga_mf['race_ethn'] == 'African or Af-Am Self Reported', 1, 0)
la_mf['black_ind'] = np.where(la_mf['race_ethn'] == 'African or Af-Am Self Reported', 1, 0)
nc_mf['black_ind'] = np.where(nc_mf['race_ethn'] == 'African or Af-Am Self Reported', 1, 0)
sc_mf['black_ind'] = np.where(sc_mf['race_ethn'] == 'African or Af-Am Self Reported', 1, 0)
tn_mf['black_ind'] = np.where(tn_mf['race_ethn'] == 'African or Af-Am Self Reported', 1, 0)

### Compute covariance conditions within each matched dataset
import sys
sys.path.insert(1,'/REDACTED/fairness/code/utilities')
import estimators as et
## AL
varlist = ['Metrics', 'Full Pop Unweighted', 'EITC Unweighted', 'Non-EITC Unweighted']
al_output = pd.DataFrame(columns=varlist)
al_output['Metrics'] = ['E_cov_cond_b', 'E_cov_cond_b_se', 'E_cov_cond_B', 'E_cov_cond_B_se', 'nc_weights']
filters = [None, ~al_mf.taxpayer_id_new.isna(), al_mf.isEIC==1, al_mf.isEIC==0]

# compute covariance conditions in each group
for i in range(len(varlist)):
    print(i)
    if varlist[i] == 'Metrics':
        pass
    else:
        column = []
        dat = al_mf[filters[i]]
        est,se,tstat,pval,_ = et.computeExpectedCovariance(dat, weightvarb='unitwt', outcomevarb='aud_no_research_audits', covarb='black_ind', conditioningvarb='p_black_rd')
        column.append(est*10000)
        column.append(se*10000)
        est,se,tstat,pval,_ = et.computeExpectedCovariance(dat, weightvarb='unitwt', outcomevarb='aud_no_research_audits', covarb='predicted_prob_black', conditioningvarb='black_ind')
        column.append(est*10000)
        column.append(se*10000)
        column.append(0)
        al_output[varlist[i]] = column

al_output.loc[len(al_output)] = ["N", len(al_mf), (al_mf.isEIC==1).sum(), (al_mf.isEIC==0).sum()]

al_output.to_latex('/REDACTED/residual_covariance_terms_SE_AL_V2.tex', index=False)

## FL

varlist = ['Metrics', 'Full Pop Unweighted', 'EITC Unweighted', 'Non-EITC Unweighted']
fl_output = pd.DataFrame(columns=varlist)
fl_output['Metrics'] = ['E_cov_cond_b', 'E_cov_cond_b_se', 'E_cov_cond_B', 'E_cov_cond_B_se', 'nc_weights']
filters = [None, ~fl_mf.taxpayer_id_new.isna(), fl_mf.isEIC==1, fl_mf.isEIC==0]

# compute covariance conditions in each group
for i in range(len(varlist)):
    print(i)
    if varlist[i] == 'Metrics':
        pass
    else:
        column = []
        dat = fl_mf[filters[i]]
        est,se,tstat,pval,_ = et.computeExpectedCovariance(dat, weightvarb='unitwt', outcomevarb='aud_no_research_audits', covarb='black_ind', conditioningvarb='p_black_rd')
        column.append(est*10000)
        column.append(se*10000)
        est,se,tstat,pval,_ = et.computeExpectedCovariance(dat, weightvarb='unitwt', outcomevarb='aud_no_research_audits', covarb='predicted_prob_black', conditioningvarb='black_ind')
        column.append(est*10000)
        column.append(se*10000)
        column.append(0)
        fl_output[varlist[i]] = column

fl_output.loc[len(fl_output)] = ["N", len(fl_mf), (fl_mf.isEIC==1).sum(), (fl_mf.isEIC==0).sum()]

fl_output.to_latex('/REDACTED/residual_covariance_terms_SE_FL_V2.tex', index=False)

## GA

varlist = ['Metrics', 'Full Pop Unweighted', 'EITC Unweighted', 'Non-EITC Unweighted']
ga_output = pd.DataFrame(columns=varlist)
ga_output['Metrics'] = ['E_cov_cond_b', 'E_cov_cond_b_se', 'E_cov_cond_B', 'E_cov_cond_B_se', 'nc_weights']
filters = [None, ~ga_mf.taxpayer_id_new.isna(), ga_mf.isEIC==1, ga_mf.isEIC==0]

# compute covariance conditions in each group
for i in range(len(varlist)):
    print(i)
    if varlist[i] == 'Metrics':
        pass
    else:
        column = []
        dat = ga_mf[filters[i]]
        est,se,tstat,pval,_ = et.computeExpectedCovariance(dat, weightvarb='unitwt', outcomevarb='aud_no_research_audits', covarb='black_ind', conditioningvarb='p_black_rd')
        column.append(est*10000)
        column.append(se*10000)
        est,se,tstat,pval,_ = et.computeExpectedCovariance(dat, weightvarb='unitwt', outcomevarb='aud_no_research_audits', covarb='predicted_prob_black', conditioningvarb='black_ind')
        column.append(est*10000)
        column.append(se*10000)
        column.append(0)
        ga_output[varlist[i]] = column

ga_output.loc[len(ga_output)] = ["N", len(ga_mf), (ga_mf.isEIC==1).sum(), (ga_mf.isEIC==0).sum()]

ga_output.to_latex('/REDACTED/residual_covariance_terms_SE_GA_V2.tex', index=False)

## LA

varlist = ['Metrics', 'Full Pop Unweighted', 'EITC Unweighted', 'Non-EITC Unweighted']
la_output = pd.DataFrame(columns=varlist)
la_output['Metrics'] = ['E_cov_cond_b', 'E_cov_cond_b_se', 'E_cov_cond_B', 'E_cov_cond_B_se', 'nc_weights']
filters = [None, ~la_mf.taxpayer_id_new.isna(), la_mf.isEIC==1, la_mf.isEIC==0]

# compute covariance conditions in each group
for i in range(len(varlist)):
    print(i)
    if varlist[i] == 'Metrics':
        pass
    else:
        column = []
        dat = la_mf[filters[i]]
        est,se,tstat,pval,_ = et.computeExpectedCovariance(dat, weightvarb='unitwt', outcomevarb='aud_no_research_audits', covarb='black_ind', conditioningvarb='p_black_rd')
        column.append(est*10000)
        column.append(se*10000)
        est,se,tstat,pval,_ = et.computeExpectedCovariance(dat, weightvarb='unitwt', outcomevarb='aud_no_research_audits', covarb='predicted_prob_black', conditioningvarb='black_ind')
        column.append(est*10000)
        column.append(se*10000)
        column.append(0)
        la_output[varlist[i]] = column

la_output.loc[len(la_output)] = ["N", len(la_mf), (la_mf.isEIC==1).sum(), (la_mf.isEIC==0).sum()]

la_output.to_latex('/REDACTED/residual_covariance_terms_SE_LA_V2.tex', index=False)

## NC

varlist = ['Metrics', 'Full Pop Unweighted', 'EITC Unweighted', 'Non-EITC Unweighted']
nc_output = pd.DataFrame(columns=varlist)
nc_output['Metrics'] = ['E_cov_cond_b', 'E_cov_cond_b_se', 'E_cov_cond_B', 'E_cov_cond_B_se', 'nc_weights']
filters = [None, ~nc_mf.taxpayer_id_new.isna(), nc_mf.isEIC==1, nc_mf.isEIC==0]

# compute covariance conditions in each group
for i in range(len(varlist)):
    print(i)
    if varlist[i] == 'Metrics':
        pass
    else:
        column = []
        dat = nc_mf[filters[i]]
        est,se,tstat,pval,_ = et.computeExpectedCovariance(dat, weightvarb='unitwt', outcomevarb='aud_no_research_audits', covarb='black_ind', conditioningvarb='p_black_rd')
        column.append(est*10000)
        column.append(se*10000)
        est,se,tstat,pval,_ = et.computeExpectedCovariance(dat, weightvarb='unitwt', outcomevarb='aud_no_research_audits', covarb='predicted_prob_black', conditioningvarb='black_ind')
        column.append(est*10000)
        column.append(se*10000)
        column.append(0)
        nc_output[varlist[i]] = column

nc_output.loc[len(nc_output)] = ["N", len(nc_mf), (nc_mf.isEIC==1).sum(), (nc_mf.isEIC==0).sum()]

nc_output.to_latex('/REDACTED/residual_covariance_terms_SE_NC_round2_V2.tex', index=False)

## SC

varlist = ['Metrics', 'Full Pop Unweighted', 'EITC Unweighted', 'Non-EITC Unweighted']
sc_output = pd.DataFrame(columns=varlist)
sc_output['Metrics'] = ['E_cov_cond_b', 'E_cov_cond_b_se', 'E_cov_cond_B', 'E_cov_cond_B_se', 'nc_weights']
filters = [None, ~sc_mf.taxpayer_id_new.isna(), sc_mf.isEIC==1, sc_mf.isEIC==0]

# compute covariance conditions in each group
for i in range(len(varlist)):
    print(i)
    if varlist[i] == 'Metrics':
        pass
    else:
        column = []
        dat = sc_mf[filters[i]]
        est,se,tstat,pval,_ = et.computeExpectedCovariance(dat, weightvarb='unitwt', outcomevarb='aud_no_research_audits', covarb='black_ind', conditioningvarb='p_black_rd')
        column.append(est*10000)
        column.append(se*10000)
        est,se,tstat,pval,_ = et.computeExpectedCovariance(dat, weightvarb='unitwt', outcomevarb='aud_no_research_audits', covarb='predicted_prob_black', conditioningvarb='black_ind')
        column.append(est*10000)
        column.append(se*10000)
        column.append(0)
        sc_output[varlist[i]] = column

sc_output.loc[len(sc_output)] = ["N", len(sc_mf), (sc_mf.isEIC==1).sum(), (sc_mf.isEIC==0).sum()]

sc_output.to_latex('/REDACTED/residual_covariance_terms_SE_SC_V2.tex', index=False)

## TN

varlist = ['Metrics', 'Full Pop Unweighted', 'EITC Unweighted', 'Non-EITC Unweighted']
tn_output = pd.DataFrame(columns=varlist)
tn_output['Metrics'] = ['E_cov_cond_b', 'E_cov_cond_b_se', 'E_cov_cond_B', 'E_cov_cond_B_se', 'nc_weights']
filters = [None, ~tn_mf.taxpayer_id_new.isna(), tn_mf.isEIC==1, tn_mf.isEIC==0]

# compute covariance conditions in each group
for i in range(len(varlist)):
    print(i)
    if varlist[i] == 'Metrics':
        pass
    else:
        column = []
        dat = tn_mf[filters[i]]
        est,se,tstat,pval,_ = et.computeExpectedCovariance(dat, weightvarb='unitwt', outcomevarb='aud_no_research_audits', covarb='black_ind', conditioningvarb='p_black_rd')
        column.append(est*10000)
        column.append(se*10000)
        est,se,tstat,pval,_ = et.computeExpectedCovariance(dat, weightvarb='unitwt', outcomevarb='aud_no_research_audits', covarb='predicted_prob_black', conditioningvarb='black_ind')
        column.append(est*10000)
        column.append(se*10000)
        column.append(0)
        tn_output[varlist[i]] = column

tn_output.loc[len(tn_output)] = ["N", len(tn_mf), (tn_mf.isEIC==1).sum(), (tn_mf.isEIC==0).sum()]

tn_output.to_latex('/REDACTED/residual_covariance_terms_SE_TN_V2.tex', index=False)

# re-format output dataframes for plottaxpayer_idg
al_massaged = al_output.set_index('Metrics').transpose().reset_index()
al_massaged = al_massaged.rename(columns={'index':'pop'})
al_massaged['state'] = 'AL'

fl_massaged = fl_output.set_index('Metrics').transpose().reset_index()
fl_massaged = fl_massaged.rename(columns={'index':'pop'})
fl_massaged['state'] = 'FL'

ga_massaged = ga_output.set_index('Metrics').transpose().reset_index()
ga_massaged = ga_massaged.rename(columns={'index':'pop'})
ga_massaged['state'] = 'GA'

la_massaged = la_output.set_index('Metrics').transpose().reset_index()
la_massaged = la_massaged.rename(columns={'index':'pop'})
la_massaged['state'] = 'LA'

nc_massaged = nc_output.set_index('Metrics').transpose().reset_index()
nc_massaged = nc_massaged.rename(columns={'index':'pop'})
nc_massaged['state'] = 'NC'

sc_massaged = sc_output.set_index('Metrics').transpose().reset_index()
sc_massaged = sc_massaged.rename(columns={'index':'pop'})
sc_massaged['state'] = 'SC'

tn_massaged = tn_output.set_index('Metrics').transpose().reset_index()
tn_massaged = tn_massaged.rename(columns={'index':'pop'})
tn_massaged['state'] = 'TN'

# stitch together different states
all_massaged = pd.concat([al_massaged, fl_massaged, ga_massaged, la_massaged, nc_massaged, sc_massaged, tn_massaged])

# first plot: covariance conditions for full population
full_pop = all_massaged[all_massaged['pop'] == 'Full Pop Unweighted']

fig,ax = plt.subplots(1,1,sharex = True, sharey = True, figsize=(5,10))
x = full_pop['state']
y_b = full_pop['E_cov_cond_b']
yerr_b = full_pop['E_cov_cond_b_se'] * 1.96
y_B = full_pop['E_cov_cond_B']
yerr_B = full_pop['E_cov_cond_B_se'] * 1.96


plt.errorbar(x, y_b, yerr=yerr_b, fmt="o", capsize = 9, label = "E[cov(Y,B)|b]")
plt.errorbar(x, y_B, yerr=yerr_B, fmt="o", capsize = 9, label = "E[cov(Y,b)|B]")
plt.xlabel('State', fontsize=16)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.ylabel('Estimated Covariance', fontsize=16)
plt.title('Full Population', fontsize=16)
ax.axhline(y = 0, color = 'red', linestyle = 'dotted')
plt.legend(fontsize=13)

plt.savefig('/REDACTED/cov_conditions_states_V2.png', facecolor='white', bbox_inches='tight')
plt.close()

# second plot: covariance conditions for eitc population
eic = all_massaged[all_massaged['pop'] == 'EITC Unweighted']

fig,ax = plt.subplots(1,1,sharex = True, sharey = True, figsize=(5,10))
x = eic['state']
y_b = eic['E_cov_cond_b']
yerr_b = eic['E_cov_cond_b_se'] * 1.96
y_B = eic['E_cov_cond_B']
yerr_B = eic['E_cov_cond_B_se'] * 1.96


plt.errorbar(x, y_b, yerr=yerr_b, fmt="o", capsize = 9, label = "E[cov(Y,B)|b]")
plt.errorbar(x, y_B, yerr=yerr_B, fmt="o", capsize = 9, label = "E[cov(Y,b)|B]")
plt.title('EITC', fontsize=16)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.xlabel('State', fontsize=16)
plt.ylabel('Estimated Covariance', fontsize=16)
ax.axhline(y = 0, color = 'red', linestyle = 'dotted')
plt.legend(fontsize=13)

plt.savefig('/REDACTED/cov_conditions_states_eic_V2.png', facecolor='white', bbox_inches='tight')
plt.close()

# third plot: covariance conditions for non-eitc population
non_eic = all_massaged[all_massaged['pop'] == 'Non-EITC Unweighted']

fig,ax = plt.subplots(1,1,sharex = True, sharey = True, figsize=(5,10))
x = non_eic['state']
y_b = non_eic['E_cov_cond_b']
yerr_b = non_eic['E_cov_cond_b_se'] * 1.96
y_B = non_eic['E_cov_cond_B']
yerr_B = non_eic['E_cov_cond_B_se'] * 1.96


plt.errorbar(x, y_b, yerr=yerr_b, fmt="o", capsize = 9, label = "E[cov(Y,B)|b]")
plt.errorbar(x, y_B, yerr=yerr_B, fmt="o", capsize = 9, label = "E[cov(Y,b)|B]")
plt.title('Non-EITC', fontsize=16)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.xlabel('State', fontsize=16)
plt.ylabel('Estimated Covariance', fontsize=16)
ax.axhline(y = 0, color = 'red', linestyle = 'dotted')
plt.legend(fontsize=13)

plt.savefig('/REDACTED/cov_conditions_states_non_eic_V2.png', facecolor='white', bbox_inches='tight')
plt.close()
